The city venue data will be used to explore the cities.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to every matplotlib figure drawn below.
sns.set()

# Cities for which venue data is available; the exploratory analysis below
# runs on one of them at a time.
cities = ['Delhi','Mumbai','Kolkata','Chennai']
city = cities[1]

# The venue file is named after the city; its first CSV column is used as
# the DataFrame index.
venues_path = city + '_venues.csv'
city_venues = pd.read_csv(venues_path, index_col=0)
city_venues.head()
# Quick look at how many venue rows each neighbourhood has.
venues_per_neighborhood = city_venues.groupby('Neighborhood').count()
venues_per_neighborhood.head()

# One indicator column per venue category (no prefix, so the column names
# are the raw category names).
city_onehot = pd.get_dummies(city_venues[['Venue Category']], prefix="", prefix_sep="")

# Re-attach the neighbourhood of every venue row.
city_onehot['Neighborhood'] = city_venues['Neighborhood']

# Rotate the column order so that the last column becomes the first one.
fixed_columns = city_onehot.columns.tolist()
fixed_columns = fixed_columns[-1:] + fixed_columns[:-1]
city_onehot = city_onehot[fixed_columns]

# Tag every row with the city currently being analysed.
city_onehot['City'] = city
city_onehot.head()
# Total number of venues per category across the whole city.
#
# 'Neighborhood' is a text column, so it is dropped explicitly before
# summing. Relying on the aggregation to discard it only works on older
# pandas releases; newer ones sum (concatenate) the strings, and the mixed
# str/int column can then no longer be sorted.
city_grouped = (
    city_onehot.drop(columns=['Neighborhood'])
    .groupby('City')
    .sum()
    .transpose()  # rows: venue categories, single column named after the city
)
city_grouped.head()

# Most frequent categories first; chart the ten largest.
city_grouped.sort_values([city], ascending=False, inplace=True)
city_grouped.iloc[0:10].plot(kind='bar', figsize=(10, 5))
plt.title(city + ' Venue distribution')
plt.ylabel('Total no of venues in the City')
plt.xlabel('Venue Categories')
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    Args:
        row: pandas Series whose first element is a row label (the caller
            passes a row whose first column is 'Neighborhood') and whose
            remaining elements are per-category values.
        num_top_venues: maximum number of labels to return.

    Returns:
        numpy array of index labels, ordered from highest to lowest value.
        It is shorter than ``num_top_venues`` when the row has fewer
        categories than requested.
    """
    # Position 0 holds the row label, not a venue score, so it is skipped.
    row_categories = row.iloc[1:]

    # A stable sort keeps tied values in a reproducible order instead of
    # leaving their relative order to quicksort.
    row_categories_sorted = row_categories.sort_values(
        ascending=False, kind='mergesort'
    )
    return row_categories_sorted.index.values[0:num_top_venues]
# Mean frequency of each venue category per neighbourhood.
# The constant text column 'City' is removed first: averaging a string
# column is an error on recent pandas versions, and it carries no
# information for the per-neighbourhood profile anyway.
city_grouped = (
    city_onehot.drop(columns=['City'], errors='ignore')
    .groupby('Neighborhood')
    .mean()
    .reset_index()
)
city_grouped.head()
# Column headers for the "top venues per neighbourhood" table:
# 'Neighborhood', '1st Most Common Venue', '2nd ...', '3rd ...', '4th ...', ...
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for rank in range(1, num_top_venues + 1):
    # The first three ranks take their suffix from `indicators`; every
    # later rank uses 'th'. An explicit bounds check replaces the former
    # bare `except:`, which would have hidden any unrelated error too.
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))
# Table with one row per neighbourhood: its name followed by its most
# common venue categories, in the column layout prepared in `columns`.
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = city_grouped['Neighborhood']

# Fill the venue columns row by row from the per-neighbourhood frequencies.
for row_pos in range(len(city_grouped)):
    neighborhood_row = city_grouped.iloc[row_pos]
    top_venues = return_most_common_venues(neighborhood_row, num_top_venues)
    neighborhoods_venues_sorted.iloc[row_pos, 1:] = top_venues

neighborhoods_venues_sorted.head()
# import necessary packages
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Elbow method: fit k-means for a range of k and plot the inertia of each
# fit to help choose the number of clusters.

# Feature matrix: every column except the neighbourhood name.
city_grouped_clustering = city_grouped.drop(columns='Neighborhood')

sse = {}
# k cannot exceed the number of samples, so cap the range accordingly.
max_k = min(9, len(city_grouped_clustering))
for k in range(1, max_k + 1):
    # run k-means clustering
    kmeans = KMeans(n_clusters=k, random_state=0, max_iter=1000).fit(city_grouped_clustering)
    # The labels are deliberately NOT written back into the feature matrix:
    # doing so would make every later fit cluster on the previous labels.
    # Inertia: sum of squared distances of samples to their closest centre.
    sse[k] = kmeans.inertia_

plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()
# Number of clusters used for the final model.
kclusters = 3

# Feature matrix: every column except the neighbourhood name. The keyword
# form of `drop` is used because the positional axis argument is not
# accepted by recent pandas versions.
city_grouped_clustering = city_grouped.drop(columns='Neighborhood')

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(city_grouped_clustering)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]
# Per-neighbourhood reference data for the selected city; the first CSV
# column is used as the index.
df = pd.read_csv(city + '_subdiv.csv', index_col=0)
df.head()

# add clustering labels as the first column of the top-venues table
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach the cluster label and top venues to each row of `df`, matching on
# the neighbourhood name. `join` returns a new frame, so `df` is unchanged.
city_merged = df.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# Rows with missing values (e.g. neighbourhoods without venue data) are
# dropped. The unmatched rows force 'Cluster Labels' to float during the
# join, so it is cast back to integer once those rows are gone.
city_merged = city_merged.dropna().astype({'Cluster Labels': int})
city_merged.head() # check the last columns!
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
# use geocoder library, if not present use !conda install -c conda-forge geocoder
import geocoder
# A Google API key is required for the geocoder library to work with Google geocoding; save the API key in an
# OS environment variable named GOOGLE_API_KEY and then access that key here.
import os
# Use BING_API_KEY when choosing to use bing geocoding instead of google geocoding.
# The key is read from the environment so that no credential is stored in
# the source file; a missing variable fails immediately with a KeyError.
BING_API_KEY = os.environ['BING_API_KEY']
def get_latlng(address):
    """Geocode ``address`` with the Bing geocoder and return its coordinates.

    Args:
        address: the address (or place name) to look up.

    Returns:
        A pandas Series built from the geocoder result's ``latlng``
        attribute (presumably [latitude, longitude] - the caller unpacks it
        in that order).
    """
    # Bing is used here in preference to the Google geocoder.
    geocode_result = geocoder.bing(address, key=BING_API_KEY)
    lat_lng = geocode_result.latlng
    return pd.Series(lat_lng)
# Geocode the selected city itself; its coordinates are used to centre the
# maps drawn below.
city_coordinates = get_latlng(city)
latitude, longitude = city_coordinates
print('Lat : ',latitude,' Long : ',longitude)
def visualize_area_in_map(data, folium_map=None):
    """Add one circle marker per row of ``data`` to a folium map.

    Args:
        data: DataFrame with 'Latitude', 'Longitude', 'Neighborhood' and
            'City' columns.
        folium_map: map to draw on. When omitted, the module-level variable
            named ``map`` is used, which keeps existing single-argument
            calls working.

    Returns:
        The map the markers were added to.
    """
    # Fall back to the global `map` only when no map is passed explicitly.
    target_map = map if folium_map is None else folium_map

    # add markers to map
    for lat, lng, neighborhood, city_name in zip(
        data['Latitude'], data['Longitude'], data['Neighborhood'], data['City']
    ):
        label = '{}, {}'.format(neighborhood, city_name)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=2,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False).add_to(target_map)
    return target_map
# Base map centred on the selected city's coordinates.
city_center = [latitude, longitude]
map = folium.Map(location=city_center, zoom_start=10)

# Only rows without missing values are plotted.
data = df.dropna()
visualize_area_in_map(data)
# Map of the clustered neighbourhoods, centred on the selected city.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# One evenly spaced rainbow colour per cluster, as hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(city_merged['Latitude'], city_merged['Longitude'], city_merged['Neighborhood'], city_merged['Cluster Labels']):
    # The label may arrive as a float after the join; normalise it once so
    # the popup reads "Cluster 1" and the colour lookup uses a plain index.
    cluster_id = int(cluster)
    label = folium.Popup('{} Cluster {}'.format(poi, cluster_id), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=4,
        popup=label,
        # Cluster i uses colour i (no negative-index wraparound for cluster 0).
        color=rainbow[cluster_id],
        fill=True,
        fill_color=rainbow[cluster_id],
        fill_opacity=0.9).add_to(map_clusters)

map_clusters
# Inspect the members of each cluster. The view keeps the column at
# position 1 plus every column from position 5 onwards.
cluster_columns = city_merged.columns[[1] + list(range(5, city_merged.shape[1]))]

# Cluster 0
city_merged.loc[city_merged['Cluster Labels'] == 0, cluster_columns]
# Cluster 1
city_merged.loc[city_merged['Cluster Labels'] == 1, cluster_columns]
# Cluster 2
city_merged.loc[city_merged['Cluster Labels'] == 2, cluster_columns]